Recomendation system¶

In [1]:
import numpy as np 
import pandas as pd 
import random as rd 
import string

Step 2: Read Data¶

In [6]:
personal_information = pd.read_csv('personal_information.csv')
In [3]:
pred_food = pd.read_csv('pred_food.csv')
In [4]:
personal_information.head()
Out[4]:
user_id name gender age consumption_status sleeping_status height_cm weight_kg
0 1 znpdujka M 51 3 1 178.57 66.28
1 2 dciqezrl M 61 2 3 182.36 88.97
2 3 jlkbynux F 42 3 2 170.24 69.74
3 4 runpybnd F 23 1 4 158.38 60.75
4 5 kquicatc F 34 3 2 166.09 71.10
In [5]:
pred_food.head()
Out[5]:
0 1 2 3 4 5 6 7 8 9 ... 21 22 23 24 25 26 27 28 29 user_id
0 Quesadilla apples bagels chips haiku roll cupcakes hamburger (and cheeseburgers and bacon cheeseb... halibut baked beans Reuben ... franks coffee Irish stew cake gumbo crab broccoli bluefish Lamb 1
1 jerky celery clams catfish cake bison kingfish Spinach gumbo bacon ... Wine asparagus barley English muffins artichoke halibut Irish stew Milk applesauce 2
2 donuts Pepperoni granola applesauce dates jerky English muffins goose apples Venison ... BBQ kale Noodles eel sushi baked beans broccoli halibut Avocado roll hash browns 3
3 eggs Reuben Tater tots Ziti Wine jambalaya curry Venison dips Spinach ... Garlic Pancakes baked beans Yogurt Toast French toast grits falafel Quesadilla 4
4 avacado cookies almond English muffins cake Ziti honey carrots Bruscetta Wine ... gumbo Apple juice French dip goose Quesadilla broccoli apples bagels catfish 5

5 rows Ɨ 31 columns

In [9]:
personal_information.merge(pred_food)
Out[9]:
user_id name gender age consumption_status sleeping_status height_cm weight_kg 0 1 ... 20 21 22 23 24 25 26 27 28 29
0 1 znpdujka M 51 3 1 178.57 66.28 jelly / jam cheese ... coffee jalapeƱo Pepperoni BBQ buritto arugala Porter Spinach bluefish Italian bread
1 2 dciqezrl M 61 2 3 182.36 88.97 Spaghetti chocolate ... ice cream carrots bluefish fajita almond Ostrich Quiche gnocchi cheese halibut
2 3 jlkbynux F 42 3 2 170.24 69.74 Toast apples ... kingfish French dip Noodles bacon haiku roll chimichanga carne asada bread Ostrich bison
3 4 runpybnd F 23 1 4 158.38 60.75 alfalfa honey ... chicken bluefish Quiche BBQ apples kiwi bread clams halibut barley
4 5 kquicatc F 34 3 2 166.09 71.10 chicken ketchup ... ice cream hash browns broccoli carne asada Spaghetti artichoke beer hamburger (and cheeseburgers and bacon cheeseb... Garlic chocolate
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 996 upewyige M 69 1 4 185.94 68.28 Spaghetti asparagus ... Garlic Spinach Porter Lamb arugala granola Cabbage Walnuts Quesadilla applesauce
996 997 hiqiobms M 69 3 3 167.66 83.55 kidney beans fondu ... Meatballs carne asada Walnuts alfalfa babaganoosh Avocado roll curry chocolate crab black beans
997 998 fduqivxb M 16 3 3 175.71 62.74 buritto fondu ... French toast Ostrich Irish stew curry Lasagna dips Porter corn duck cereal
998 999 zsjmhybd M 77 3 2 186.75 86.11 coffee haiku roll ... Spaghetti lobster broccoli celery Ziti Venison cheese halibut gumbo Milk
999 1000 mqoldlhw M 2 1 4 81.81 13.28 kabobs Spaghetti ... Quesadilla buritto cereal Yogurt Moose goose dates Ostrich almond Garlic

1000 rows Ɨ 38 columns

In [18]:
personal_information.to_csv('personal_information.csv', index=False)
pred_food.to_csv('pred_food.csv',index=False)

Step 2: Data preparing for clustering¶

In [10]:
df = pd.read_csv("/Users/peiyicai/Desktop/personal_information.csv")
df.describe()
Out[10]:
user_id age consumption_status sleeping_status height_cm weight_kg
count 1000.000000 1000.0000 1000.000000 1000.000000 1000.000000 1000.000000
mean 500.500000 41.1140 2.001000 2.977000 164.322240 64.822230
std 288.819436 22.8999 0.820776 1.441023 23.568473 21.551041
min 1.000000 1.0000 1.000000 1.000000 76.370000 12.450000
25% 250.750000 22.0000 1.000000 2.000000 160.992500 55.670000
50% 500.500000 42.0000 2.000000 3.000000 170.120000 68.130000
75% 750.250000 61.0000 3.000000 4.000000 177.770000 78.692500
max 1000.000000 80.0000 3.000000 5.000000 190.000000 99.890000
In [11]:
df.nunique()
Out[11]:
user_id               1000
name                  1000
gender                   2
age                     80
consumption_status       3
sleeping_status          5
height_cm              883
weight_kg              924
dtype: int64
In [12]:
df.drop(columns='name', inplace= True)
df.head(5)
Out[12]:
user_id gender age consumption_status sleeping_status height_cm weight_kg
0 1 M 51 3 1 178.57 66.28
1 2 M 61 2 3 182.36 88.97
2 3 F 42 3 2 170.24 69.74
3 4 F 23 1 4 158.38 60.75
4 5 F 34 3 2 166.09 71.10
In [13]:
df.columns
Out[13]:
Index(['user_id', 'gender', 'age', 'consumption_status', 'sleeping_status',
       'height_cm', 'weight_kg'],
      dtype='object')
In [14]:
df.isnull().sum()
Out[14]:
user_id               0
gender                0
age                   0
consumption_status    0
sleeping_status       0
height_cm             0
weight_kg             0
dtype: int64

Data Preprocessing¶

(1) Label encoding the categorical features

(2) Scaling the features using the standard scaler

(3) Creating a subset dataframe for dimensionality reduction(PCA)

In [15]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Label encoding:

In [16]:
label_encoder = preprocessing.LabelEncoder() 
df['gender'] = label_encoder.fit_transform(df['gender'])
df.head()
Out[16]:
user_id gender age consumption_status sleeping_status height_cm weight_kg
0 1 1 51 3 1 178.57 66.28
1 2 1 61 2 3 182.36 88.97
2 3 0 42 3 2 170.24 69.74
3 4 0 23 1 4 158.38 60.75
4 5 0 34 3 2 166.09 71.10

Feature Scaling

In [17]:
df_model = df.copy()
df_model.drop(columns='user_id',inplace= True)
#Scaling
num_cols = ['age', 'consumption_status', 'sleeping_status',
       'height_cm', 'weight_kg']

for i in num_cols:    
    scaler = StandardScaler().fit(df_model[[i]])
    df_model[i] = scaler.transform(df_model[[i]])

df_model.head(5)
Out[17]:
gender age consumption_status sleeping_status height_cm weight_kg
0 1 0.431921 1.217750 -1.372628 0.604829 0.067677
1 1 0.868823 -0.001219 0.015969 0.765717 1.121053
2 0 0.038709 1.217750 -0.678330 0.251214 0.228306
3 0 -0.791404 -1.220188 0.710268 -0.252253 -0.189052
4 0 -0.310812 1.217750 -0.678330 0.075043 0.291444

Dimension reduction by using PCA:

In [18]:
pca = PCA(n_components=3)
pca.fit(df_model)
PCA_df = pd.DataFrame(pca.transform(df_model), columns=(["pca_1","pca_2", "pca_3"]))
PCA_df.describe().T
Out[18]:
count mean std min 25% 50% 75% max
pca_1 1000.0 3.375078e-17 1.518887 -2.248485 -0.969409 -0.401735 0.376758 4.607408
pca_2 1000.0 -6.328271e-18 1.019621 -1.960823 -0.830785 0.002051 0.815201 1.968591
pca_3 1000.0 -1.008083e-16 0.982167 -1.924781 -0.794694 0.000407 0.826620 1.997339

plot of the initial data after PCA:

In [19]:
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
In [20]:
#A 3D Projection Of Data In The Reduced Dimension
x =PCA_df["pca_1"]
y =PCA_df["pca_2"]
z =PCA_df["pca_3"]

#To plot
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x,y,z, c="maroon", marker="o" )
ax.set_title("A 3D Projection Of Data In The Reduced Dimension")
plt.show()

Step 3: Clustering¶

Elbow Method to determine the number of clusters to be formed

In [21]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
In [22]:
# Using elbow method to find numbers of clusters to make.
print('Elbow Method to determine the number of clusters to be formed:')
Elbow_M = KElbowVisualizer(KMeans(), k=15)
Elbow_M.fit(PCA_df)
Elbow_M.show()
Elbow Method to determine the number of clusters to be formed:
Out[22]:
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>

So we use 5 as the number of clusters to do the following steps

1.Hierarchical Clustering for Customer Data¶
In [23]:
#Initiating the Agglomerative Clustering model 
AC = AgglomerativeClustering(n_clusters=5, affinity = 'euclidean', linkage ='average')


# fit model and predict clusters
yhat_AC = AC.fit_predict(PCA_df)
PCA_df["clusters_hier"] = yhat_AC

#Adding the Clusters feature to the orignal dataframe.
df["clusters_hier"]= yhat_AC
In [24]:
df.head()
Out[24]:
user_id gender age consumption_status sleeping_status height_cm weight_kg clusters_hier
0 1 1 51 3 1 178.57 66.28 0
1 2 1 61 2 3 182.36 88.97 0
2 3 0 42 3 2 170.24 69.74 0
3 4 0 23 1 4 158.38 60.75 1
4 5 0 34 3 2 166.09 71.10 0

Plot:

In [25]:
import plotly as py
import plotly.graph_objs as go
In [26]:
# 3d scatterplot using plotly
Scene = dict(xaxis = dict(title  = 'pca_1 -->'),yaxis = dict(title  = 'pca_2--->'),
             zaxis = dict(title  = 'pca_3-->'))

# AC.labels_ is nothing but the predicted clusters
x = PCA_df[['pca_1','pca_2','pca_3']].values
labels = AC.labels_
trace = go.Scatter3d(x=x[:, 0], y=x[:, 1], z=x[:, 2], mode='markers',marker=dict(color = labels, size= 10, line=dict(color= 'black',width = 10)))
layout = go.Layout(title= 'Clusters using Agglomerative Clustering',margin=dict(l=0,r=0),scene = Scene,height = 800,width = 800)
data = [trace]
fig = go.Figure(data = data, layout = layout)
fig.show()
2. KMeans¶
In [27]:
KM = KMeans(n_clusters = 5, init = "k-means++", max_iter = 300, n_init = 10, random_state = 0)
y_KM = KM.fit_predict(PCA_df)
In [28]:
PCA_df["clusters_km"] = y_KM

#Adding the Clusters feature to the orignal dataframe.
df["clusters_km"]= y_KM

Plot:

In [29]:
# 3d scatterplot using plotly
Scene = dict(xaxis = dict(title  = 'pca_1 -->'),yaxis = dict(title  = 'pca_2--->'),
             zaxis = dict(title  = 'pca_3-->'))

# AC.labels_ is nothing but the predicted clusters
x = PCA_df[['pca_1','pca_2','pca_3']].values
labels = KM.labels_
trace = go.Scatter3d(x=x[:, 0], y=x[:, 1], z=x[:, 2], mode='markers',marker=dict(color = labels, size= 10, line=dict(color= 'black',width = 10)))
layout = go.Layout(title= 'Clusters using Kmeans',margin=dict(l=0,r=0),scene = Scene,height = 800,width = 800)
data = [trace]
fig = go.Figure(data = data, layout = layout)
fig.show()
Density-based - DBSCAN¶
In [30]:
from sklearn.neighbors import NearestNeighbors # importing the library
neighb = NearestNeighbors(n_neighbors=2) # creating an object of the NearestNeighbors class
nbrs=neighb.fit(x) # fitting the data to the object
distances,indices=nbrs.kneighbors(PCA_df[['pca_1','pca_2','pca_3']]) 

# Sort and plot the distances results
distances = np.sort(distances, axis = 0) # sorting the distances
distances = distances[:, 1] # taking the second column of the sorted distances
plt.rcParams['figure.figsize'] = (5,3) # setting the figure size
plt.plot(distances) # plotting the distances
plt.show() # showing the plot
/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning:

X has feature names, but NearestNeighbors was fitted without feature names

In [31]:
db = DBSCAN(eps=0.3, min_samples=6, metric='euclidean', algorithm='auto', leaf_size=30)
y_db = db.fit_predict(PCA_df)
In [32]:
PCA_df["clusters_DBSCAN"] = y_db

#Adding the Clusters feature to the orignal dataframe.
df["clusters_DBSCAN"]= y_db
In [33]:
np.unique(db.labels_)
Out[33]:
array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20])
In [34]:
# 3d scatterplot using plotly
Scene = dict(xaxis = dict(title  = 'pca_1 -->'),yaxis = dict(title  = 'pca_2--->'),
             zaxis = dict(title  = 'pca_3-->'))

# AC.labels_ is nothing but the predicted clusters
x = PCA_df[['pca_1','pca_2','pca_3']].values
labels = db.labels_
trace = go.Scatter3d(x=x[:, 0], y=x[:, 1], z=x[:, 2], mode='markers',marker=dict(color = labels, size= 10, line=dict(color= 'black',width = 10)))
layout = go.Layout(title= 'Clusters using DBSCAN',margin=dict(l=0,r=0),scene = Scene,height = 800,width = 800)
data = [trace]
fig = go.Figure(data = data, layout = layout)
fig.show()
In [35]:
df.to_csv('personal_information_labeled.csv',index=False)

Step4: Recommendation¶

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
In [58]:
# load the dataset  (dataframe)
df = pd.read_csv('/Users/peiyicai/Desktop/personal_information_labeled.csv', encoding='utf-8')
In [59]:
df.head(12)
Out[59]:
user_id gender age consumption_status sleeping_status height_cm weight_kg clusters_hier clusters_km clusters_DBSCAN pred_food_list
0 1 1 79 1 4 174.03 83.01 1 0 0 Quesadilla, apples, bagels, chips, haiku roll,...
1 2 0 74 2 4 168.05 67.93 1 1 1 jerky, celery, clams, catfish, cake, bison, ki...
2 3 0 39 1 4 164.47 59.86 1 0 0 donuts, Pepperoni, granola, applesauce, dates,...
3 4 0 40 1 3 177.81 55.27 1 0 2 eggs, Reuben, Tater tots, Ziti, Wine, jambalay...
4 5 0 15 1 5 169.12 66.82 1 0 3 avacado, cookies, almond, English muffins, cak...
5 6 1 7 1 1 126.16 22.44 4 4 -1 jambalaya, bison, grits, cake, chicken, gumbo,...
6 7 1 75 3 5 177.44 85.26 2 1 7 Walnuts, Wine, jerky, ketchup, bagels, franks,...
7 8 0 8 1 2 122.28 25.29 0 4 21 hummus, goose, duck, Ostrich, kale, artichoke,...
8 9 1 43 3 1 172.49 95.49 2 2 4 ham, arugala, Noodles, buritto, alfalfa, Yogur...
9 10 1 68 3 3 181.15 70.93 2 1 5 French dip, Venison, gnocchi, ginger, celery, ...
10 11 0 48 3 3 165.69 64.15 2 1 5 French toast, Milkshake, Lasagna, Graham crack...
11 12 0 55 1 3 168.89 69.93 1 0 2 apples, bread, grits, antelope, broccoli, avac...
In [65]:
#predict user's 
input_user= df[df['user_id']<6].drop(['clusters_hier','clusters_km','clusters_DBSCAN','pred_food_list'],axis=1)
y_KM_user1 = KM.fit_predict(input_user)
y_KM_user1
Out[65]:
array([3, 1, 4, 2, 0], dtype=int32)
In [66]:
# get the specific subset(based on label)
#we choose the second user to do recommendation
# and we can see the label of user 2 is 1:
expected_label=1
df_sub=df[df['clusters_km']==expected_label]
In [67]:
df_sub.head(10)
Out[67]:
user_id gender age consumption_status sleeping_status height_cm weight_kg clusters_hier clusters_km clusters_DBSCAN pred_food_list
1 2 0 74 2 4 168.05 67.93 1 1 1 jerky, celery, clams, catfish, cake, bison, ki...
6 7 1 75 3 5 177.44 85.26 2 1 7 Walnuts, Wine, jerky, ketchup, bagels, franks,...
9 10 1 68 3 3 181.15 70.93 2 1 5 French dip, Venison, gnocchi, ginger, celery, ...
10 11 0 48 3 3 165.69 64.15 2 1 5 French toast, Milkshake, Lasagna, Graham crack...
13 14 1 41 3 3 183.06 86.25 2 1 5 Ostrich, kingfish, Quesadilla, French dip, Pan...
14 15 1 59 3 4 167.35 85.02 2 1 6 Lamb, ham, carne asada, franks, catfish, cooki...
15 16 0 39 3 5 167.75 61.66 2 1 7 coffee, hamburger (and cheeseburgers and bacon...
17 18 0 56 3 3 170.06 69.69 2 1 5 Venison, fajita, Porter, barley, French dip, Q...
21 22 1 36 2 5 184.28 97.44 2 1 9 Linguine, ketchup, BBQ, Lasagna, Moose, huenos...
24 25 0 13 3 3 154.22 40.12 1 1 -1 cookies, bacon, Irish stew, hamburger (and che...
In [68]:
# get the index of id
def index_from_id(df,id):

 return df[df['user_id']==id].index.values[0]

# id: the one you want to match, it can be student id or company id, if it is student, then retrieve all companys skillset, and create a dataframe,(match company for a given student)
# and if it is company id, then retrieve all students skillset, and create a dataframe(match students for a given company)
# df: dataframe needed to be make
#cosine_similarity_matrix: converted from dataframe[keyword] column
#number_of_recommendations: the number you want to match
def recommendations( id, df,cosine_similarity_matrix,number_of_recommendations):

  index = index_from_id(df,id)

  similarity_scores = list(enumerate(cosine_similarity_matrix[index]))

  similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

  recommendations_indices = [t[0] for t in similarity_scores_sorted[1:(number_of_recommendations+1)]]
  # this could be df['ID'] 
  return df[['user_id','pred_food_list']].iloc[recommendations_indices]
In [69]:
vect = CountVectorizer(stop_words='english')
# transform df['key word'] into matrix
vect_matrix = vect.fit_transform(df_sub['pred_food_list'])
vect_matrix
# get the cosine_similarity_matrix
cosine_similarity_matrix_count_based = cosine_similarity(vect_matrix, vect_matrix)
In [70]:
recommendations(2, df_sub,cosine_similarity_matrix_count_based,10)
Out[70]:
user_id pred_food_list
827 828 cupcakes, hot dogs, kale, Yogurt, granola, ice...
795 796 franks, chowder, antelope, jerky, Cabbage, Moo...
983 984 chowder, ham, kale, jelly / jam, chocolate, La...
679 680 kabobs, bread, Pizza, BBQ, Spaghetti, Quesadil...
951 952 huenos rancheros, apples, dips, bagels, Wine, ...
302 303 Quiche, Ziti, French dip, celery, Moose, fondu...
978 979 barley, apples, asparagus, jalapeƱo, almond, b...
989 990 cupcakes, ham, bread, Avocado roll, fondu, chi...
191 192 Wine, cupcakes, arugala, black beans, Venison,...
254 255 Pizza, dumplings, Ostrich, Spaghetti, artichok...
In [ ]:
 
In [ ]: